Below is a list of the top comutating genes with KRAS in CRC. The last four columns have the number of tunor samples with the various combination of mutations; for example, G mut & K WT has the number of tumors with the other gene (G) mutated and KRAS (K) WT.
nonallele_specific_increased_comutation_df %.% {
filter(cancer == "COAD" & hugo_symbol != "KRAS")
filter(p_value < 0.01)
arrange(p_value)
mutate(
p_value = scales::scientific(p_value, digits = 3),
odds_ratio = round(odds_ratio, digits = 3),
geneWT_krasWT = map_dbl(comut_ct_tbl, ~ .x[1, 1]),
geneMut_krasWT = map_dbl(comut_ct_tbl, ~ .x[2, 1]),
geneWT_krasMut = map_dbl(comut_ct_tbl, ~ .x[1, 2]),
geneMut_krasMut = map_dbl(comut_ct_tbl, ~ .x[2, 2]),
)
select(hugo_symbol, p_value, odds_ratio, tidyselect::starts_with("gene"))
rename(
gene = hugo_symbol,
`p-value` = p_value,
`odds ratio` = odds_ratio,
`G WT & K WT` = geneWT_krasWT,
`G mut & K WT` = geneMut_krasWT,
`G WT & K mut` = geneWT_krasMut,
`G mut & K mut` = geneMut_krasMut
)
}
The data used for this analysis was the RNA expression data from TCGA-COAD. This data set has RNA expression for 525 tumor samples. 78 of these samples are hypermutants; these samples were removed from the analysis.
main_alleles <- c("WT", "KRAS_G12A", "KRAS_G12C", "KRAS_G12D", "KRAS_G12V", "KRAS_G13D")
dusp_rna_data <- tcga_coad_rna %.% {
filter(str_detect(hugo_symbol, "DUSP"))
filter(!is_hypermutant)
select(-is_hypermutant)
mutate(
allele = ifelse(ras_allele %in% !!main_alleles, ras_allele, "Other"),
allele = str_remove(allele, "KRAS_"),
allele = factor_alleles(allele)
)
}
# Put DUSP genes in order.
dusp_order <- unique(dusp_rna_data$hugo_symbol)
dusp_num <- as.numeric(str_remove_all(dusp_order, "[:alpha:]"))
dusp_order <- dusp_order[order(dusp_num)]
dusp_rna_data$hugo_symbol <- factor(dusp_rna_data$hugo_symbol, levels = dusp_order)
Below is a sample of the RNA expression data table.
dusp_rna_data %>%
rename(
`DUSP` = hugo_symbol,
`tumor sample` = tumor_sample_barcode,
`RNA (RSEM)` = rna_expr
) %>%
select(-ras_allele) %>%
head() %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| DUSP | tumor sample | RNA (RSEM) | allele |
|---|---|---|---|
| DUSP10 | TCGA-3L-AA1B | 254.352 | WT |
| DUSP10 | TCGA-4N-A93T | 133.527 | G12D |
| DUSP10 | TCGA-4T-AA8H | 275.635 | G12V |
| DUSP10 | TCGA-5M-AAT4 | 507.745 | G12D |
| DUSP10 | TCGA-5M-AAT5 | 572.762 | G12D |
| DUSP10 | TCGA-5M-AATA | 209.145 | WT |
The number of tumor samples with missing data for each DUSP gene.
dusp_rna_data %>%
filter(is.na(rna_expr)) %>%
count(hugo_symbol, sort = TRUE) %>%
pivot_wider(names_from = hugo_symbol, values_from = n) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE,
position = "left"
)
| DUSP13 | DUSP21 |
|---|---|
| 149 | 149 |
The number of tumor samples with 0 RNA expression values for each DUSP gene.
dusp_rna_data %>%
filter(rna_expr <= 0) %>%
count(hugo_symbol, sort = TRUE) %>%
pivot_wider(names_from = hugo_symbol, values_from = n) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE,
position = "left"
)
| DUSP21 | DUSP13 | DUSP5P | DUSP26 | DUSP9 | DUSP27 | DUSP15 |
|---|---|---|---|---|---|---|
| 284 | 217 | 73 | 58 | 34 | 11 | 2 |
All negative RNA expression values were set to 0.
dusp_rna_data %<>% mutate(rna_expr = map_dbl(rna_expr, ~ max(0, .x)))
dusp_rna_data %>%
filter(!is.na(rna_expr)) %>%
mutate(rna_expr = rna_expr + 1) %>%
ggplot(aes(x = allele, y = rna_expr)) +
facet_wrap(~hugo_symbol, scales = "free_y") +
geom_boxplot(outlier.shape = NA) +
scale_y_continuous(trans = "log10") +
theme(
panel.grid.major.x = element_blank(),
axis.text = element_text(size = 7),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
labs(x = NULL, y = "RNA expression (log10 + 1)")
DUSP13 and DUSP21 were removed from the analysis because they were missing data and had very low expression levels.
IGNORE_DUSPS <- c("DUSP13", "DUSP21")
dusp_rna_data %<>% filter(!hugo_symbol %in% IGNORE_DUSPS)
plot_dusp_distribtions <- function(df, x) {
df %>%
ggplot(aes(x = {{ x }})) +
facet_wrap(~hugo_symbol, scales = "free") +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
geom_density() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text = element_text(size = 6),
axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(size = 7),
panel.spacing = unit(1, "mm")
) +
labs(x = "RNA expression")
}
plot_dusp_distribtions(dusp_rna_data, rna_expr) +
ggtitle("Non-transformed RNA expression values")
dusp_rna_data %>%
mutate(rna_expr = log10(rna_expr + 1)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("log10(RNA + 1) transformed data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) transformed data")
dusp_rna_data %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("Centralized and scaled (z-scale) data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) & z-scaled data")
The \(\log_{10}(\text{RNA} + 1)\), centralized, and scaled values will be used for the analysis.
dusp_rna_data %<>%
mutate(log10_rna_expr = log10(rna_expr + 1)) %>%
group_by(hugo_symbol) %>%
mutate(log10_z_rna = scale_numeric(log10_rna_expr)) %>%
ungroup()
dusp_corr <- dusp_rna_data %>%
pivot_wider(id = tumor_sample_barcode, names_from = hugo_symbol, values_from = log10_z_rna) %>%
select(-tumor_sample_barcode) %>%
corrr::correlate()
dusp_corr_pheat <- dusp_corr %>%
as.data.frame() %>%
column_to_rownames("rowname")
colnames(dusp_corr_pheat) <- str_remove(colnames(dusp_corr_pheat), "DUSP")
rownames(dusp_corr_pheat) <- str_remove(rownames(dusp_corr_pheat), "DUSP")
pheatmap::pheatmap(
dusp_corr_pheat,
border_color = NA,
na_col = "white",
main = "Correlation of DUSP gene expression",
)
corrr::network_plot(dusp_corr, min_cor = 0.3) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "DUSP gene expression correlation network")
new_allele_order <- as.character(sort(unique(dusp_rna_data$allele)))
new_allele_order <- c("WT", new_allele_order[new_allele_order != "WT"])
data <- dusp_rna_data %>%
mutate(
grp_allele = case_when(
allele == "G13D" ~ "G13D",
allele == "WT" ~ "WT",
TRUE ~ "KRAS"
),
grp_allele = factor(grp_allele, levels = c("WT", "G13D", "KRAS")),
allele = factor(as.character(allele), levels = new_allele_order)
)
# FOR TESTING
if (FALSE) {
set.seed(0)
TESTING_DUSPS <- paste0("DUSP", 1:6)
TESTING_TSBS <- sample(unique(data$tumor_sample_barcode), 100)
data <- data %.% {
filter(hugo_symbol %in% TESTING_DUSPS)
filter(tumor_sample_barcode %in% TESTING_TSBS)
}
}
The follow model was constructed and fit on the \(log_{10}\) and z-scaled RNA expression. The model has a single global intercept and an additional varying intercept and slope for each DUSP. The model also contains a covariate for the correlation between varying intercepts and slopes, but I left this out of the model, for simplicity.
\[ RNA_z \sim \mathcal{N}(\mu, \sigma) \\ \mu = \alpha + \alpha_{DUSP} + \beta K_{DUSP} \\ \alpha \sim \mathcal{N}(0, 2) \quad \alpha_{DUSP} \sim \mathcal{N}(0, 2) \quad K_{DUSP} \sim \mathcal{N}(0, 2) \\ \sigma \sim \text{Exp}(1) \]
stash("lm_stan_hier", depends_on = "data", {
lm_stan_hier <- stan_glmer(
log10_z_rna ~ 1 + (1 + allele | hugo_symbol),
data = data,
prior = normal(location = 0, scale = 2),
prior_intercept = normal(location = 0, scale = 2),
prior_aux = exponential(rate = 1),
prior_covariance = decov(regularization = 1, concentration = 1, shape = 1, scale = 1)
)
})
#> Loading stashed object.
Trace plots for the global intercept and standard deviation \(\sigma\).
mcmc_trace(lm_stan_hier, pars = "(Intercept)") /
mcmc_trace(lm_stan_hier, pars = "sigma")
Trace plots for parameters for DUSP1.
mcmc_trace(lm_stan_hier, regex_pars = "DUSP1\\]") +
scale_x_continuous(expand = c(0, 0))
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
pp_check(lm_stan_hier, plotfun = "stat", stat = "mean") +
ggtitle("Distrbition of error of the posterior predictions")
pp_check(lm_stan_hier, plotfun = "stat_2d", stat = c("mean", "sd")) +
ggtitle("Standard deviation and mean of the error of the posterior predictions")
The posterior distribution of the parameters of varying effects in the model.
lm_dusp_post <- as.data.frame(lm_stan_hier) %.% {
mutate(draw = row_number())
select(draw, `(Intercept)`, tidyselect::contains("DUSP"))
pivot_longer(
-c(draw, `(Intercept)`),
names_to = "parameter",
values_to = "value"
)
mutate(
parameter = str_remove_all(parameter, "[:punct:]"),
parameter = str_remove_all(parameter, "b|allele|hugosymbol|")
)
separate(parameter, c("allele", "dusp"), sep = " ")
mutate(allele = str_replace(allele, "Intercept", "WT"))
}
lm_dusp_post %>%
ggplot(aes(x = value)) +
facet_wrap(~dusp, ncol = 4, scales = "free") +
geom_vline(
xintercept = 0,
lty = 2,
color = "grey50",
size = 0.9
) +
geom_density(aes(color = allele), size = 1) +
scale_color_manual(values = short_allele_pal) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
labs(
x = "posterior distribution",
y = "probability density",
color = "KRAS allele"
)
The following plot presents a summary of the distributions (the curves shown above). Each plot shows each KRAS allele’s parameter for its association with DUSP expression. The point represents the median of the posterior and the bars represent the 50% and 89% highest density intervals (credible intervals). If all of these are outside of the region of practical equivalence (ROPE; the shaded region), we can say that we are confident that the real parameter value is non-zero and is of meaningful size (assuming the model design).
summarise_to_hdi <- function(df, value_col) {
df %.% {
summarise(
avg = median({{ value_col }}),
hdi_50 = list(unlist(ggdist::hdi({{ value_col }}, .width = 0.50))),
hdi_89 = list(unlist(ggdist::hdi({{ value_col }}, .width = 0.89))),
)
ungroup()
mutate(
hdi_50_lower = map_dbl(hdi_50, ~ .x[[1]]),
hdi_50_upper = map_dbl(hdi_50, ~ .x[[2]]),
hdi_89_lower = map_dbl(hdi_89, ~ .x[[1]]),
hdi_89_upper = map_dbl(hdi_89, ~ .x[[2]])
)
select(-hdi_50, -hdi_89)
}
}
linerange_plots_parameter_values <- function(p) {
p +
geom_rect(
xmin = -0.1, xmax = 0.1, ymin = Inf, ymax = -Inf,
fill = "grey80",
color = NA,
alpha = 0.1
) +
geom_vline(xintercept = 0, color = "grey50") +
geom_point(size = 2) +
geom_linerange(
aes(xmin = hdi_50_lower, xmax = hdi_50_upper),
size = 1.2,
alpha = 0.8
) +
geom_linerange(
aes(xmin = hdi_89_lower, xmax = hdi_89_upper),
size = 0.9,
alpha = 0.5
)
}
p <- lm_dusp_post %.%
{
group_by(dusp, allele)
summarise_to_hdi(value)
ungroup()
} %>%
ggplot(aes(x = avg, y = allele, color = allele)) +
facet_wrap(~dusp, scales = "free_x") +
scale_color_manual(values = short_allele_pal, drop = TRUE) +
theme(legend.position = "none") +
labs(
x = "posterior distributions",
y = NULL
)
linerange_plots_parameter_values(p)
The following table contains all of the data presented in the above plot:
bayestestR::describe_posterior(lm_stan_hier, effects = "all") %>%
arrange(Effects) %>%
as_tibble() %>%
mutate(
Parameter = str_remove_all(Parameter, "[:punct:]"),
Parameter = str_remove_all(Parameter, "^b|allele|hugosymbol"),
) %>%
mutate_if(is.numeric, round, digits = 2)
#> Possible multicollinearity between Sigma[hugo_symbol:alleleG12D,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.79), Sigma[hugo_symbol:alleleG12V,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.71), Sigma[hugo_symbol:alleleG12D,alleleG12D] and Sigma[hugo_symbol:alleleG12D,(Intercept)] (r = 0.77), Sigma[hugo_symbol:alleleG12V,alleleG12V] and Sigma[hugo_symbol:alleleG12V,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG12D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG13D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleOther,alleleG12D] and Sigma[hugo_symbol:alleleOther,(Intercept)] (r = 0.74). This might lead to inappropriate results. See 'Details' in '?rope'.
A useful check to make sure the model fit well is to have it make predictions for the RNA expression of each DUSP for each KRAS allele by sampling from the posterior distributions of the parameters and feeding them through the model’s equation. The predicted distributions should resemble the real distributions. These look good to me.
pred_data <- data %>% distinct(hugo_symbol, allele)
y_pred <- rstanarm::posterior_predict(lm_stan_hier,
newdata = pred_data
)
lm_dusp_ppc_hpi <- apply(y_pred, 2, function(x) {
y <- bayestestR::hdi(x, ci = 0.89, verbose = FALSE)
return(tibble(ppc_ci_low_z = y$CI_low, ppc_ci_high_z = y$CI_high))
}) %>%
bind_rows() %>%
mutate(ppc_avg_z = apply(y_pred, 2, mean)) %>%
bind_cols(pred_data)
data_original_value_summary <- data %>%
group_by(hugo_symbol) %>%
summarise(
original_avg = mean(log10_rna_expr),
original_sd = sd(log10_rna_expr)
) %>%
ungroup()
lm_dusp_postpred <- lm_dusp_ppc_hpi %>%
left_join(data_original_value_summary, by = "hugo_symbol") %>%
mutate(
ppc_avg = 10**((ppc_avg_z * original_sd) + original_avg),
ppc_ci_low = 10**((ppc_ci_low_z * original_sd) + original_avg),
ppc_ci_high = 10**((ppc_ci_high_z * original_sd) + original_avg)
) %>%
ungroup()
lm_dusp_postpred %>%
ggplot(aes(x = allele)) +
facet_wrap(~hugo_symbol, nrow = 5, scales = "free_y") +
geom_jitter(
aes(y = rna_expr),
data = data,
color = "grey50", alpha = 0.2, size = 0.2,
height = 0, width = 0.25
) +
geom_linerange(
aes(ymin = ppc_ci_low, ymax = ppc_ci_high),
size = 0.5, color = "red", alpha = 0.5
) +
geom_point(
aes(y = ppc_avg),
size = 1, color = "red"
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1)
) +
labs(x = NULL, y = "RNA expression", title = "Posterior predictions")
The following analysis tries to identify an association between DUSP gene expression and whether or not a tumor sample has an APC mutation.
apc_dusp_rna_data <- coad_apc_mutations %>%
group_by(tumor_sample_barcode) %>%
summarise(
apc_mutations = list(mutation_type),
apc_aa_changes = list(amino_acid_change)
) %>%
ungroup() %>%
add_column(is_apc_mut = TRUE) %>%
right_join(dusp_rna_data, by = "tumor_sample_barcode") %>%
mutate(is_apc_mut = ifelse(is.na(is_apc_mut), FALSE, is_apc_mut))
plot_count <- function(df, x, x_lbl, title, nudge_y = 0) {
ggplot(df, aes(x = {{ x }}, y = n)) +
geom_col() +
geom_text(
aes(label = scales::comma(n, accuracy = 1)),
nudge_y = nudge_y,
size = 3.5,
vjust = 0
) +
scale_x_discrete(labels = function(x) {
str_replace_all(x, "_", " ")
}) +
scale_y_continuous(
expand = expansion(mult = c(0, 0.02), add = c(0, nudge_y))
) +
theme(
plot.title = element_markdown()
) +
labs(x = x_lbl, y = "count", title = title)
}
apc_dusp_rna_data %.%
{
mutate(
apc_mutation = map_chr(apc_mutations, function(x) {
if (is.null(x)) {
return("WT")
} else if (length(x) == 1) {
return(x)
} else {
return("multiple")
}
})
)
count(apc_mutation)
mutate(apc_mutation = fct_reorder(apc_mutation, -n))
} %>%
plot_count(
apc_mutation,
x_lbl = "APC mutations",
title = "Frequency of types of mutations to *APC*",
nudge_y = 40
)
apc_dusp_rna_data %.%
{
unnest(apc_mutations)
rename(apc_mutation = apc_mutations)
count(apc_mutation)
mutate(apc_mutation = fct_reorder(apc_mutation, -n))
} %>%
plot_count(
apc_mutation,
x_lbl = "APC mutations",
title = "Frequency of types of mutations to *APC*",
nudge_y = 40
)
apc_mut_pal <- c("TRUE" = "#1D71C9", "FALSE" = "#ED1442")
apc_mut_lbl <- c("TRUE" = "APC mut.", "FALSE" = "WT")
apc_dusp_rna_data %.%
{
mutate(rna_expr = rna_expr + 1)
} %>%
ggplot(aes(x = is_apc_mut, y = rna_expr)) +
facet_wrap(~hugo_symbol, scales = "free_y") +
geom_jitter(aes(color = is_apc_mut), size = 0.1, alpha = 0.5) +
geom_boxplot(
aes(color = is_apc_mut),
size = 0.7, outlier.shape = NA, fill = "white", alpha = 0.4
) +
scale_x_discrete(labels = apc_mut_lbl) +
scale_y_continuous(trans = "log10") +
scale_color_manual(
values = apc_mut_pal,
labels = apc_mut_lbl,
guide = guide_legend(override.aes = list(size = 1, alpha = 1))
) +
theme(
panel.grid.major.x = element_blank(),
axis.text = element_text(size = 7),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_blank()
) +
labs(x = NULL, y = "RNA expression (log10 + 1)")
pos <- position_jitterdodge(
jitter.width = 0.2, jitter.height = 0,
dodge.width = 0.8
)
apc_dusp_rna_data %.%
{
mutate(rna_expr = rna_expr + 1)
} %>%
ggplot(aes(x = hugo_symbol, y = rna_expr)) +
geom_jitter(aes(color = is_apc_mut), position = pos, size = 0.1, alpha = 0.5) +
geom_boxplot(aes(color = is_apc_mut), outlier.shape = NA, fill = "white", alpha = 0.4) +
scale_y_continuous(trans = "log10") +
scale_color_manual(
values = apc_mut_pal,
labels = apc_mut_lbl
) +
theme(
panel.grid.major.x = element_blank(),
axis.text = element_text(size = 7),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_blank()
) +
labs(x = NULL, y = "RNA expression (log10 + 1)")
apc_dusp_rna_data %.%
{
mutate(rna_expr = rna_expr + 1)
} %>%
ggplot(aes(x = rna_expr)) +
facet_wrap(~hugo_symbol, scales = "free", ncol = 4) +
geom_density(aes(color = is_apc_mut, fill = is_apc_mut), size = 1, alpha = 0.2) +
scale_x_log10() +
scale_y_continuous(expand = expansion(mult = c(0, 0.02))) +
scale_color_manual(
values = apc_mut_pal,
labels = apc_mut_lbl
) +
scale_fill_manual(
values = apc_mut_pal,
labels = apc_mut_lbl
) +
theme(
legend.title = element_blank()
) +
labs(x = "RNA expression (log10 + 1)")
From the above density plots, I think it would be best to only model a selection of the DUSPs. At the very least, the DUSPs with clear bi-modal distributions should be removed as they cannot be modeled as a Gaussian distribution. The following analysis will be conducted with only DUSP4, DUSP6, DUSP9, DUSP15, DUSP19, and DUSP28.
MODEL_DUSPS <- c("DUSP4", "DUSP6", "DUSP9", "DUSP15", "DUSP19", "DUSP28")
data <- apc_dusp_rna_data %.% {
filter(hugo_symbol %in% MODEL_DUSPS)
}
# FOR TESTING
if (FALSE) {
warning("IN TESTING MODE.")
set.seed(0)
TESTING_TSBS <- sample(unique(data$tumor_sample_barcode), 100)
data <- data %.% {
filter(tumor_sample_barcode %in% TESTING_TSBS)
}
}
Below is the model that is fit to analyze the association of APC mutation with differential DUSP expression. It is a hierarchical model with a varying intercept and slope for each DUSP. There is also a correlation term that is left out of the formula for simplicity. It measures the correlation between the varying intercepts and slopes.
\[ RNA_z \sim \mathcal{N}(\mu, \sigma) \\ \mu = \alpha + \alpha_{DUSP} + \beta A_{DUSP} \\ \alpha \sim \mathcal{N}(0, 2) \quad \alpha_{DUSP} \sim \mathcal{N}(0, 2) \quad A_{DUSP} \sim \mathcal{N}(0, 2) \\ \sigma \sim \text{Exp}(1) \]
stash("apc_lm_stan_hier", depends_on = "data", {
apc_lm_stan_hier <- stan_glmer(
log10_z_rna ~ 1 + (1 + is_apc_mut | hugo_symbol),
data = data,
prior = normal(location = 0, scale = 2),
prior_intercept = normal(location = 0, scale = 2),
prior_aux = exponential(rate = 1),
prior_covariance = decov(regularization = 1, concentration = 1, shape = 1, scale = 1)
)
})
#> Loading stashed object.
Trace plots for the global intercept and standard deviation \(\sigma\).
mcmc_trace(apc_lm_stan_hier, pars = "(Intercept)") /
mcmc_trace(apc_lm_stan_hier, pars = "sigma")
Trace plots for parameters.
mcmc_trace(apc_lm_stan_hier) +
scale_x_continuous(expand = c(0, 0))
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
pp_check(apc_lm_stan_hier, plotfun = "stat", stat = "mean") +
ggtitle("Distrbition of error of the posterior predictions")
pp_check(apc_lm_stan_hier, plotfun = "stat_2d", stat = c("mean", "sd")) +
ggtitle("Standard deviation and mean of the error of the posterior predictions")
Like above for the KRAS alleles, the posterior distributions of the parameters for APC mutation status are summarized below.
apc_lm_post <- as.data.frame(apc_lm_stan_hier) %.% {
mutate(sample_idx = row_number())
pivot_longer(-c(sample_idx, `(Intercept)`))
filter(str_detect(name, "^b"))
mutate(name = str_remove_all(name, "^b|\\[|\\]"))
separate(name, into = c("variable", "dusp"), sep = " ")
mutate(
variable = ifelse(variable == "(Intercept)", "apc_wt", "apc_mut"),
dusp = str_remove(dusp, "^hugo_symbol:"),
dusp_num = as.numeric(str_remove(dusp, "DUSP")),
dusp = fct_reorder(dusp, dusp_num)
)
select(-dusp_num)
}
p <- apc_lm_post %.%
{
group_by(variable, dusp)
summarise_to_hdi(value)
ungroup()
mutate(is_apc_mut = variable == "apc_mut")
} %>%
ggplot(aes(x = avg, y = is_apc_mut, color = is_apc_mut)) +
facet_wrap(~dusp, scales = "free_x") +
scale_color_manual(
values = apc_mut_pal,
labels = apc_mut_lbl
) +
scale_y_discrete(labels = apc_mut_lbl) +
theme(legend.position = "none") +
labs(
x = "posterior distributions",
y = NULL
)
linerange_plots_parameter_values(p)
The strongest associations seems to be the reduced DUSP4 expression and increased DUSP15 expression are associated with APC mutation. Recall that the expression of these two genes are negatively correlated. The transformed expression of these genes for each tumor sample (each point) is plotted below and the tumors are colored by whether or not they have an APC mutation. There may be a trend here, but it appears to be slight.
plot_data <- apc_dusp_rna_data %.% {
filter(hugo_symbol %in% paste0("DUSP", c(4, 15)))
select(tumor_sample_barcode, hugo_symbol, is_apc_mut, log10_z_rna)
pivot_wider(id_cols = c(tumor_sample_barcode, is_apc_mut), names_from = hugo_symbol, values_from = log10_z_rna)
}
plot_data_summary <- plot_data %.% {
group_by(is_apc_mut)
summarise_if(is.numeric, mean)
ungroup()
}
plot_data %>%
ggplot(aes(x = DUSP4, y = DUSP15, color = is_apc_mut)) +
geom_hline(yintercept = 0, color = "grey70", size = 0.6) +
geom_vline(xintercept = 0, color = "grey70", size = 0.6) +
geom_point(
aes(fill = is_apc_mut),
data = plot_data_summary,
size = 8, shape = 23, show.legend = FALSE
) +
geom_point(
size = 2,
alpha = 0.6
) +
scale_color_manual(
values = apc_mut_pal,
labels = apc_mut_lbl
) +
scale_fill_manual(
values = apc_mut_pal,
labels = apc_mut_lbl,
guide = NULL
) +
theme(
plot.title = element_markdown(),
axis.title.x = element_markdown(),
axis.title.y = element_markdown()
) +
labs(
x = "DUSP4 (log<sub>10</sub> and z-scaled)",
y = "DUSP15 (log<sub>10</sub> and z-scaled)",
color = NULL, fill = NULL,
title = "Clustering of APC mutants with lower *DUSP4* and higher *DUSP15* expression"
)